library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.5 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(corrplot)
## corrplot 0.84 loaded
library("PerformanceAnalytics")
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
library(gtsummary)
## #Uighur
library(flextable)
##
## Attaching package: 'flextable'
## The following object is masked from 'package:gtsummary':
##
## as_flextable
## The following object is masked from 'package:purrr':
##
## compose
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1
library(e1071)
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:PerformanceAnalytics':
##
## kurtosis, skewness
food_data <- read_csv("Food_Supply_kcal_Data.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## Country = col_character(),
## Undernourished = col_character(),
## `Unit (all except Population)` = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
head(food_data)
## # A tibble: 6 x 32
## Country `Alcoholic Beve… `Animal Product… `Animal fats` `Aquatic Produc…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan… 0 4.78 0.850 0
## 2 Albania 0.912 16.1 1.06 0
## 3 Algeria 0.0896 6.03 0.194 0
## 4 Angola 1.94 4.69 0.264 0
## 5 Antigu… 2.30 15.4 1.54 0
## 6 Argent… 1.44 15.0 1.06 0
## # … with 27 more variables: `Cereals - Excluding Beer` <dbl>, Eggs <dbl>,
## # `Fish, Seafood` <dbl>, `Fruits - Excluding Wine` <dbl>, Meat <dbl>, `Milk -
## # Excluding Butter` <dbl>, Miscellaneous <dbl>, Offals <dbl>, Oilcrops <dbl>,
## # Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>, Stimulants <dbl>, `Sugar
## # Crops` <dbl>, `Sugar & Sweeteners` <dbl>, Treenuts <dbl>, `Vegetal
## # Products` <dbl>, `Vegetable Oils` <dbl>, Vegetables <dbl>, Obesity <dbl>,
## # Undernourished <chr>, Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Population <dbl>, `Unit (all except Population)` <chr>
sum(food_data[1,2:24])
## [1] 100.0002
food_data <- food_data %>%
drop_na() %>%
mutate(Confirmed_per_capita = Confirmed/Population * 100000000, Deaths_per_capita = Deaths/Population * 100000000) %>%
#creating a column that specifies whether each country falls above or below the median
mutate(Cases_vs_median = as.character(ifelse(Confirmed_per_capita > median(Confirmed_per_capita), "Above", "Below")),
Deaths_vs_median = as.character(ifelse(Deaths_per_capita > median(Deaths_per_capita), "Above", "Below")))
food_data <- food_data %>%
select(-Miscellaneous, -Obesity, -Undernourished, -Recovered, -Active, -Population, -`Unit (all except Population)`,
-Confirmed, -Deaths)
head(food_data)
## # A tibble: 6 x 27
## Country `Alcoholic Beve… `Animal Product… `Animal fats` `Aquatic Produc…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan… 0 4.78 0.850 0
## 2 Albania 0.912 16.1 1.06 0
## 3 Algeria 0.0896 6.03 0.194 0
## 4 Angola 1.94 4.69 0.264 0
## 5 Argent… 1.44 15.0 1.06 0
## 6 Armenia 0.227 12.8 1.77 0
## # … with 22 more variables: `Cereals - Excluding Beer` <dbl>, Eggs <dbl>,
## # `Fish, Seafood` <dbl>, `Fruits - Excluding Wine` <dbl>, Meat <dbl>, `Milk -
## # Excluding Butter` <dbl>, Offals <dbl>, Oilcrops <dbl>, Pulses <dbl>,
## # Spices <dbl>, `Starchy Roots` <dbl>, Stimulants <dbl>, `Sugar Crops` <dbl>,
## # `Sugar & Sweeteners` <dbl>, Treenuts <dbl>, `Vegetal Products` <dbl>,
## # `Vegetable Oils` <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## # Deaths_per_capita <dbl>, Cases_vs_median <chr>, Deaths_vs_median <chr>
corrplot(is.corr=TRUE, cor(food_data[,2:24], use="pairwise.complete.obs"), method="number")
chart.Correlation(food_data[,2:24], histogram=TRUE)
# summary statistics
food_data_1 <- food_data %>%
select(-Deaths_vs_median, -Country)
tbl_summary(data = food_data_1, by = "Cases_vs_median",
statistic = list(all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n}({p}%)")) %>%
add_n() %>%
add_p(test = all_continuous() ~ "aov") %>%
as_flex_table() %>%
bold(part = "header")
Characteristic | N | Above, N = 771 | Below, N = 771 | p-value2 |
Alcoholic Beverages | 154 | 1.76 (1.15) | 0.86 (0.78) | <0.001 |
Animal Products | 154 | 11.5 (4.1) | 6.6 (4.2) | <0.001 |
Animal fats | 154 | 1.77 (1.50) | 0.76 (0.85) | <0.001 |
Aquatic Products, Other | 154 | 0.12 | ||
0 | 77(100%) | 73(95%) | ||
0.0185 | 0(0%) | 1(1.3%) | ||
0.0209 | 0(0%) | 1(1.3%) | ||
0.0336 | 0(0%) | 1(1.3%) | ||
0.4007 | 0(0%) | 1(1.3%) | ||
Cereals - Excluding Beer | 154 | 18 (5) | 23 (7) | <0.001 |
Eggs | 154 | 0.52 (0.26) | 0.32 (0.32) | <0.001 |
Fish, Seafood | 154 | 0.61 (0.60) | 0.57 (0.50) | 0.7 |
Fruits - Excluding Wine | 154 | 2.21 (1.46) | 1.80 (1.43) | 0.082 |
Meat | 154 | 4.44 (1.68) | 2.93 (2.25) | <0.001 |
Milk - Excluding Butter | 154 | 3.99 (1.84) | 1.89 (1.72) | <0.001 |
Offals | 154 | 0.15 (0.09) | 0.14 (0.13) | 0.6 |
Oilcrops | 154 | 0.66 (0.83) | 1.43 (1.85) | 0.001 |
Pulses | 154 | 0.79 (0.73) | 1.48 (1.52) | <0.001 |
Spices | 154 | 0.18 (0.24) | 0.19 (0.25) | >0.9 |
Starchy Roots | 154 | 2.2 (2.4) | 4.3 (5.0) | 0.001 |
Stimulants | 154 | 0.46 (0.36) | 0.15 (0.16) | <0.001 |
Sugar Crops | 154 | 0.00 (0.01) | 0.04 (0.10) | 0.005 |
Sugar & Sweeteners | 154 | 5.62 (1.63) | 3.92 (2.22) | <0.001 |
Treenuts | 154 | 0.32 (0.29) | 0.22 (0.29) | 0.035 |
Vegetal Products | 154 | 38.5 (4.1) | 43.4 (4.2) | <0.001 |
Vegetable Oils | 154 | 5.09 (2.17) | 4.66 (2.19) | 0.2 |
Vegetables | 154 | 1.20 (0.63) | 0.93 (0.63) | 0.008 |
Confirmed_per_capita | 154 | 161 (282) | 1 (2) | <0.001 |
Deaths_per_capita | 154 | 2.21 (3.74) | 0.03 (0.05) | <0.001 |
1Mean (SD); n(%) | ||||
2One-way ANOVA; Fisher's exact test | ||||
food_data_2 <- food_data %>%
select(-Cases_vs_median, -Country)
tbl_summary(data = food_data_2, by = "Deaths_vs_median",
statistic = list(all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n}({p}%)")) %>%
add_n() %>%
add_p(test = all_continuous() ~ "aov") %>%
as_flex_table() %>%
bold(part = "header")
Characteristic | N | Above, N = 771 | Below, N = 771 | p-value2 |
Alcoholic Beverages | 154 | 1.75 (1.15) | 0.88 (0.80) | <0.001 |
Animal Products | 154 | 11.4 (4.2) | 6.7 (4.2) | <0.001 |
Animal fats | 154 | 1.78 (1.50) | 0.75 (0.83) | <0.001 |
Aquatic Products, Other | 154 | 0.12 | ||
0 | 77(100%) | 73(95%) | ||
0.0185 | 0(0%) | 1(1.3%) | ||
0.0209 | 0(0%) | 1(1.3%) | ||
0.0336 | 0(0%) | 1(1.3%) | ||
0.4007 | 0(0%) | 1(1.3%) | ||
Cereals - Excluding Beer | 154 | 18 (5) | 23 (7) | <0.001 |
Eggs | 154 | 0.53 (0.26) | 0.31 (0.32) | <0.001 |
Fish, Seafood | 154 | 0.60 (0.60) | 0.58 (0.50) | 0.8 |
Fruits - Excluding Wine | 154 | 2.08 (1.37) | 1.94 (1.54) | 0.5 |
Meat | 154 | 4.38 (1.74) | 2.99 (2.24) | <0.001 |
Milk - Excluding Butter | 154 | 3.98 (1.84) | 1.90 (1.73) | <0.001 |
Offals | 154 | 0.14 (0.08) | 0.14 (0.14) | >0.9 |
Oilcrops | 154 | 0.68 (0.94) | 1.42 (1.81) | 0.002 |
Pulses | 154 | 0.78 (0.68) | 1.49 (1.54) | <0.001 |
Spices | 154 | 0.17 (0.23) | 0.20 (0.25) | 0.5 |
Starchy Roots | 154 | 2.0 (2.3) | 4.5 (5.0) | <0.001 |
Stimulants | 154 | 0.46 (0.36) | 0.15 (0.17) | <0.001 |
Sugar Crops | 154 | 0.00 (0.01) | 0.04 (0.10) | 0.004 |
Sugar & Sweeteners | 154 | 5.67 (1.65) | 3.86 (2.16) | <0.001 |
Treenuts | 154 | 0.31 (0.28) | 0.22 (0.30) | 0.045 |
Vegetal Products | 154 | 38.6 (4.2) | 43.3 (4.2) | <0.001 |
Vegetable Oils | 154 | 5.24 (2.20) | 4.52 (2.12) | 0.041 |
Vegetables | 154 | 1.19 (0.64) | 0.94 (0.62) | 0.013 |
Confirmed_per_capita | 154 | 157 (283) | 5 (27) | <0.001 |
Deaths_per_capita | 154 | 2.21 (3.74) | 0.03 (0.04) | <0.001 |
1Mean (SD); n(%) | ||||
2One-way ANOVA; Fisher's exact test | ||||
# prepare data for cross validation
food_data <- food_data %>%
# delete country since only each country has only one observation
select(-`Animal Products`, -Country)
food_data %>% head()
## # A tibble: 6 x 25
## `Alcoholic Beve… `Animal fats` `Aquatic Produc… `Cereals - Excl… Eggs
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.850 0 37.1 0.150
## 2 0.912 1.06 0 16.2 0.809
## 3 0.0896 0.194 0 25.0 0.418
## 4 1.94 0.264 0 18.4 0.0441
## 5 1.44 1.06 0 16.8 0.864
## 6 0.227 1.77 0 19.3 0.731
## # … with 20 more variables: `Fish, Seafood` <dbl>, `Fruits - Excluding
## # Wine` <dbl>, Meat <dbl>, `Milk - Excluding Butter` <dbl>, Offals <dbl>,
## # Oilcrops <dbl>, Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>,
## # Stimulants <dbl>, `Sugar Crops` <dbl>, `Sugar & Sweeteners` <dbl>,
## # Treenuts <dbl>, `Vegetal Products` <dbl>, `Vegetable Oils` <dbl>,
## # Vegetables <dbl>, Confirmed_per_capita <dbl>, Deaths_per_capita <dbl>,
## # Cases_vs_median <chr>, Deaths_vs_median <chr>
# numeric
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Confirmed_per_capita, k=10)
result_lm <- list()
result_knn <- list()
result_lasso <- list()
result_ridge <- list()
for(f in 1:length(cv_folds_10)){
food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
lm_fit = lm(Confirmed_per_capita~.-Deaths_per_capita-Cases_vs_median-Deaths_vs_median, data = food_data_train_cv_10)
knn_fit = train(Confirmed_per_capita~.-Deaths_per_capita-Cases_vs_median-Deaths_vs_median, data = food_data_train_cv_10,
method = "knn", tuneLength = 20, preProcess = c("scale", "center"),
trControl = trainControl(method="cv", number = 5))
lasso_fit = cv.glmnet(x = data.matrix(food_data[,c(1:21)]), y = data.matrix(food_data[,22]), alpha=1)
ridge_fit = cv.glmnet(x = data.matrix(food_data[,c(1:21)]), y = data.matrix(food_data[,22]), alpha=0)
#linear
food_data_test_cv_10$Confirmed_lm_predict <- predict(lm_fit, newdata = food_data_test_cv_10)
result_lm[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_lm_predict)^2, na.rm=TRUE)
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_lm_predict)
#knn
food_data_test_cv_10$Confirmed_knn_predict <- predict(knn_fit, newdata = food_data_test_cv_10)
result_knn[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_knn_predict)^2, na.rm=TRUE)
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_knn_predict)
#lasso
food_data_test_cv_10 = as.data.frame(food_data_test_cv_10)
food_data_test_cv_10$Confirmed_lasso_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:21)])) %*% coef(lasso_fit, s = "lambda.min")
result_lasso[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_lasso_predict)^2, na.rm=TRUE)
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_lasso_predict)
#ridge
food_data_test_cv_10$Confirmed_ridge_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:21)])) %*% coef(ridge_fit, s = "lambda.min")
result_ridge[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_ridge_predict)^2, na.rm=TRUE)
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_ridge_predict)
}
data.frame_lm = data.frame("CV_error" = mean(unlist(result_lm)), "CV_error_SE" = sd(unlist(result_lm)))
data.frame_knn = data.frame("CV_error" = mean(unlist(result_knn)), "CV_error_SE" = sd(unlist(result_knn)))
data.frame_lasso = data.frame("CV_error" = mean(unlist(result_lasso)), "CV_error_SE" = sd(unlist(result_lasso)))
data.frame_ridge = data.frame("CV_error" = mean(unlist(result_ridge)), "CV_error_SE" = sd(unlist(result_ridge)))
rbind("Linear Regression" = data.frame_lm, "KNN" = data.frame_knn, "Lasso" = data.frame_lasso, "Ridge" = data.frame_ridge)
## CV_error CV_error_SE
## Linear Regression 61949.59 83274.45
## KNN 38668.35 43959.12
## Lasso 26944.69 29722.09
## Ridge 34409.81 40599.06
# categorical
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Cases_vs_median, k=10)
result_logistic <- list()
result_knn <- list()
result_linear_svm <- list()
result_radial_svm <- list()
result_polynomial_svm <- list()
food_data %>% head()
## # A tibble: 6 x 25
## `Alcoholic Beve… `Animal fats` `Aquatic Produc… `Cereals - Excl… Eggs
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.850 0 37.1 0.150
## 2 0.912 1.06 0 16.2 0.809
## 3 0.0896 0.194 0 25.0 0.418
## 4 1.94 0.264 0 18.4 0.0441
## 5 1.44 1.06 0 16.8 0.864
## 6 0.227 1.77 0 19.3 0.731
## # … with 20 more variables: `Fish, Seafood` <dbl>, `Fruits - Excluding
## # Wine` <dbl>, Meat <dbl>, `Milk - Excluding Butter` <dbl>, Offals <dbl>,
## # Oilcrops <dbl>, Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>,
## # Stimulants <dbl>, `Sugar Crops` <dbl>, `Sugar & Sweeteners` <dbl>,
## # Treenuts <dbl>, `Vegetal Products` <dbl>, `Vegetable Oils` <dbl>,
## # Vegetables <dbl>, Confirmed_per_capita <dbl>, Deaths_per_capita <dbl>,
## # Cases_vs_median <chr>, Deaths_vs_median <chr>
food_data <- food_data %>%
mutate(Cases_vs_median = as.factor(ifelse(Cases_vs_median == "Above",1,0)),
Deaths_vs_median = as.factor(ifelse(Deaths_vs_median == "Above",1,0)))
for(f in 1:length(cv_folds_10)){
food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
knn_fit <- train(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10,
method = "knn", tuneLength = 20, preProcess = c("scale", "center"),
trControl = trainControl(method="cv", number = 5))
logistic_fit <- glm(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, family = binomial())
svr_linear_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="linear")
linear_svm_fit <- svr_linear_tune$best.model
svr_radial_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="radial")
radial_svm_fit <- svr_radial_tune$best.model
svr_polynomial_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="polynomial")
polynomial_svm_fit <- svr_polynomial_tune$best.model
# knn
food_data_test_cv_10$Confirmed_knn_predict <- predict(knn_fit, newdata = food_data_test_cv_10)
result_knn[[f]] <- 1-confusionMatrix(data = food_data_test_cv_10$Confirmed_knn_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_knn_predict)
#logistic
food_data_test_cv_10$Confirmed_logistic_predict_num <- predict(logistic_fit, newdata = food_data_test_cv_10, type="response", na.action=na.exclude)
food_data_test_cv_10$Confirmed_logistic_predict <- as_factor(ifelse(food_data_test_cv_10$Confirmed_logistic_predict_num >= 0.5, 1, 0))
result_logistic[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_logistic_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_logistic_predict, -Confirmed_logistic_predict_num )
# linear svm
food_data_test_cv_10$Confirmed_linear_svm_predict <- predict(linear_svm_fit, newdata = food_data_test_cv_10)
result_linear_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_linear_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_linear_svm_predict)
# radial svm
food_data_test_cv_10$Confirmed_radial_svm_predict <- predict(radial_svm_fit, newdata = food_data_test_cv_10)
result_radial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_radial_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_radial_svm_predict)
# polynomial svm
food_data_test_cv_10$Confirmed_polynomial_svm_predict <- predict(polynomial_svm_fit, newdata = food_data_test_cv_10)
result_polynomial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_polynomial_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_polynomial_svm_predict)
}
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
data.frame_knn = data.frame("CV_error" = mean(unlist(result_knn)), "CV_error_SE" = sd(unlist(result_knn)))
data.frame_logistic = data.frame("CV_error" = mean(unlist(result_logistic)), "CV_error_SE" = sd(unlist(result_logistic)))
data.frame_linear_svm = data.frame("CV_error" = mean(unlist(result_linear_svm)), "CV_error_SE" = sd(unlist(result_linear_svm)))
data.frame_radial_svm = data.frame("CV_error" = mean(unlist(result_radial_svm)), "CV_error_SE" = sd(unlist(result_radial_svm)))
data.frame_polynomial_svm = data.frame("CV_error" = mean(unlist(result_polynomial_svm)), "CV_error_SE" = sd(unlist(result_polynomial_svm)))
rbind("KNN" = data.frame_knn, "Logistic Regression" = data.frame_logistic, "Linear SVM" = data.frame_linear_svm, "Radial SVM" = data.frame_radial_svm, "Polynomial SVM" = data.frame_polynomial_svm)
## CV_error CV_error_SE
## KNN 0.2301190 0.09472007
## Logistic Regression 0.2154167 0.08665864
## Linear SVM 0.2016667 0.08239447
## Radial SVM 0.2417857 0.07898161
## Polynomial SVM 0.2536905 0.08373853
lasso_fit = cv.glmnet(x = data.matrix(food_data[,c(1:21)]), y = data.matrix(food_data[,24]), alpha=1)
ridge_fit = cv.glmnet(x = data.matrix(food_data[,c(1:21)]), y = data.matrix(food_data[,24]), alpha=0)
qda_fit = train(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, method = “qda”)
#lasso
food_data_test_cv_10 = as.data.frame(food_data_test_cv_10)
food_data_test_cv_10$Confirmed_lasso_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:21)])) %*% coef(lasso_fit, s = "lambda.min")
result_lasso[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_lasso_predict)^2, na.rm=TRUE)
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_lasso_predict)
#ridge
food_data_test_cv_10$Confirmed_ridge_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:21)])) %*% coef(ridge_fit, s = "lambda.min")
result_ridge[[f]] = mean((kcal_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_ridge_predict)^2, na.rm=TRUE)
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_ridge_predict)
data.frame_lasso = data.frame(“CV_error” = mean(unlist(result_lasso)), “CV_error_SE” = sd(unlist(result_lasso))) data.frame_ridge = data.frame(“CV_error” = mean(unlist(result_ridge)), “CV_error_SE” = sd(unlist(result_ridge)))